#### PREAMBLE ####

library(ggplot2)
library(geosphere)
library(dplyr)
library(data.table)
library(tidyr)
library(zoo)
library(sf)
library(ggspatial)
library(RColorBrewer)
library(estimatr)
library(broom)
library(texreg)
library(did)


#### LOAD DATA ####

set.seed(20230711)

#load main data
df <- read.csv("Outputs/Analysis all muns.csv", stringsAsFactors = F, na.strings = c("", "NA"))
df$INEGI <- sprintf("%05d", df$INEGI)
df$INEGI <- as.character(df$INEGI)
df$time <- as.Date(df$time)

#### TRANSFORMATION ####

#states treated by US trade restrictions
unique(df$State[df$ustrade==1])
df$ustrade[df$time>=as.Date("2016-06-01") & df$trade==1 & (df$State=="MICHOACAN")] <- 1
#updating ustrade.
df$ustrade <- df$ustrade * df$trade

#### LOAD AND JOIN PUBLIC EXPENDITURE DATA ####

# Load necessary library
library(data.table) # Using data.table for faster file reading

# Define the path to the folder containing the CSV files
folder_path <- "Source data/Public expenditure/conjunto_de_datos"

# Define a pattern to match files ending with the years 2011 to 2019
year_pattern <- paste0("(", paste(2011:2019, collapse = "|"), ")\\.csv$")

# Get a list of all CSV files in the folder that match the year pattern
csv_files <- list.files(path = folder_path, pattern = year_pattern, full.names = TRUE)

# Load all matching CSV files into a list
csv_list <- lapply(csv_files, fread) # fread is used from data.table package for faster reading

combined_data <- rbindlist(csv_list)
gc()

spend <- combined_data
rm(combined_data, csv_list)
gc()

spend$ID_ENTIDAD <- sprintf("%02d", spend$ID_ENTIDAD)
spend$ID_MUNICIPIO <- sprintf("%03d", spend$ID_MUNICIPIO)
spend$INEGI <- paste0(spend$ID_ENTIDAD, spend$ID_MUNICIPIO)
spend$category <- paste0(spend$TEMA, spend$CATEGORIA, spend$DESCRIPCION_CATEGORIA)
spend$category <- gsub(" ", "", spend$category)
gc()

spend <- spend[,c("INEGI", "ANIO", "VALOR", "category")]
gc()

spend <- spend[spend$INEGI %in% df$INEGI,]

check <- spend %>% 
  dplyr::group_by(INEGI, ANIO, category) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n > 1L) 

spend <- spend[!spend$category %in% check$category,]

rm(check)
gc()

wide_format <- spend %>%
  pivot_wider(values_from="VALOR", names_from = "category")

rm(spend)
gc()

#summarize up to municipal level
variables <- colnames(wide_format)[3:52]
average_data <- wide_format %>%
  group_by(INEGI) %>%
  summarize_at(variables, mean, na.rm = TRUE)

spend <- average_data

rm(average_data, wide_format)
gc()


#### SUMMARIZE AND MERGE DF ####

#summarize up to municipal level
variables <- c("long", "lat", "trade", "min_dist", "malicious_homicides")
df[variables] <- lapply(df[variables], function(x) {
  if(!is.numeric(x)) as.numeric(gsub(",", "", x)) else x
})
average_data <- df %>%
  group_by(INEGI) %>%
  summarize_at(variables, mean, na.rm = TRUE)

for (col in colnames(average_data)) {
  average_data[[col]][is.nan(average_data[[col]])] <- NA_real_
}

average_data$trade[average_data$trade>0] <- 1

#combine datasets
df <- left_join(average_data, spend, by=c("INEGI"="INEGI"))

objects <- ls()
rm(list = objects[objects != "df"])
rm(objects)
gc()

#### PROPENSITY SCORE MATCHING ####

library(MatchIt)
library(cobalt) 

#pare down to remove nas and variables with too much missing
df <- df[!df$min_dist=="Inf",]
na_counts <- colSums(is.na(df))

df <- df[, colSums(is.na(df)) <= 200] #subset to variables with less than 200 nas

na_counts <- colSums(is.na(df[df$trade==1,])) #check nas where trade ==1

df <- df[!(df$trade == 0 & rowSums(is.na(df)) > 0), ] #remove incomplete rows where trade is zero

df <- df[, colSums(is.na(df)) <= 1] #subset to variables with 1 or fewer nas

na_counts <- colSums(is.na(df))

#subset to complete rows
df <- df[!(rowSums(is.na(df)) > 0), ]
na_counts <- colSums(is.na(df))
sum(na_counts)==0

#set all to numeric
df[-which(names(df) == "INEGI")] <- lapply(df[-which(names(df) == "INEGI")], as.numeric)

colnames(df)[9] <- "EgresosConceptoRemuneraciones_al_personal"
colnames(df)[11] <- "EgresosConceptoMaterialesdeadministración_emisióndedocumentosyartículosoficiales"
colnames(df)[14] <- "EgresosConceptoServiciosfinancieros_bancariosycomerciales"
colnames(df)[15] <- "EgresosConceptoServiciosdeinstalación_reparación_mantenimientoyconservación"


treatment <- df$trade
covariates <- df[, !colnames(df) %in% c("INEGI", "trade", "malicious_homicides")]
outcome <- df$malicious_homicides

# Estimate propensity scores using logistic regression
ps_model <- glm(trade ~ long + lat + min_dist + EgresosTemaTotaldeegresos + 
                  EgresosCapítuloServiciospersonales + EgresosConceptoRemuneraciones_al_personal + 
                  EgresosCapítuloMaterialesysuministros + EgresosConceptoMaterialesdeadministración_emisióndedocumentosyartículosoficiales + 
                  EgresosCapítuloServiciosgenerales + EgresosConceptoServiciosbásicos + 
                  EgresosConceptoServiciosfinancieros_bancariosycomerciales + 
                  EgresosConceptoServiciosdeinstalación_reparación_mantenimientoyconservación + 
                  EgresosConceptoServiciosoficiales, data = df, family = binomial)

# Add propensity scores to the dataset
df$pscore <- predict(ps_model, type = "response")




# Perform propensity score matching
match_it <- matchit(trade ~ long + lat + min_dist + EgresosTemaTotaldeegresos + 
                      EgresosCapítuloServiciospersonales + EgresosConceptoRemuneraciones_al_personal + 
                      EgresosCapítuloMaterialesysuministros + EgresosConceptoMaterialesdeadministración_emisióndedocumentosyartículosoficiales + 
                      EgresosCapítuloServiciosgenerales + EgresosConceptoServiciosbásicos + 
                      EgresosConceptoServiciosfinancieros_bancariosycomerciales + 
                      EgresosConceptoServiciosdeinstalación_reparación_mantenimientoyconservación + 
                      EgresosConceptoServiciosoficiales,
                    data = df,
                    method = "nearest", distance = "logit")

# Extract matched data
matched_data <- match.data(match_it)



### ASSESS ###

# Summary of the matching result
summary(match_it)

# Check balance before and after matching
bal <- bal.tab(match_it, un = TRUE)
bal


#### SUBSET FOR REGRESSION ####

#save inegis to use
write.csv(matched_data[,c("INEGI")], "Outputs/propensity_matched_muns.csv", row.names = F)

rm(list=ls())
gc()
